Here we present an example analysis of 10 million PBMCs treated with cytokines and processed through the Parse Biosciences Evercode workflow. We use the Python package Scanpy. We will demonstrate the steps from loading the data, through preprocessing, to clustering.¶

In [1]:
import os
import sys
import scipy
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.io as sio
import scanpy.external as sce
import matplotlib.pyplot as plt
import time

sc.settings.verbosity = 1  # quieter Scanpy logging (presumably warnings/errors only — TODO confirm level meaning)
obj_save_path = '/10M_dataset/analysis/tutorial/'  # base directory used for every read/write below

# Adjust Scanpy figure defaults
sc.settings.set_figure_params(dpi=100, fontsize=10, dpi_save=400,
    facecolor = 'white', figsize=(6,6), format='png')

Function to compute the standard deviation of each column of a sparse CSR matrix. This will help with replacing the standard Scanpy scale and PCA functions in order to use less memory.¶

In [2]:
def sparse_std(X, blocksize=100):
    """
    Return the standard deviation of each column of a sparse matrix.

    Works block-by-block so that only `blocksize` columns are densified at a
    time, keeping peak memory far below densifying all of X at once.

    inputs
        X: sparse matrix

        blocksize: number of columns to process per step. Larger blocksize
        will increase memory usage as this function converts each block to a
        dense matrix and uses the numpy .std() function

    outputs
        X_std: numpy vector with std deviation of each column of X
    """
    # BUG FIX: the original re-assigned `blocksize = 100` here, silently
    # ignoring the caller-supplied value and defeating the memory knob.
    m = X.shape[1]
    X_std = np.empty(m)
    for start in range(0, m, blocksize):
        stop = min(start + blocksize, m)
        # Densify just this slice of columns; std over rows (axis 0).
        block = np.asarray(X[:, start:stop].todense())
        X_std[start:stop] = block.std(axis=0)
    return X_std

IRLB functions to replace the normal Scanpy scale and PCA steps in order to reduce memory usage¶

In [3]:
#   irlbpy code
#       From:   https://github.com/airysen/irlbpy
#       Date:   2021-12
#       License: Apache License, V 2.0, January 2004
#
#       Code unmodified except:
#           Added two print (feedback) blocks
#           Ran through lint formatting tool 'black' https://github.com/psf/black
#
import numpy as np
import scipy.sparse as sparse
import warnings

from numpy.fft import rfft, irfft
import numpy.linalg as nla


# Matrix-vector product wrapper
# A is a numpy 2d array or matrix, or a scipy matrix or sparse matrix.
# x is a numpy vector only.
# Compute A.dot(x) if t is False,  A.transpose().dot(x)  otherwise.


def multA(A, x, TP=False, L=None):
    """Compute A.dot(x), or A.transpose().dot(x) when TP is True.

    A is a numpy 2d array/matrix or a scipy (sparse) matrix; x is a 1-D
    numpy vector. L is accepted only for signature compatibility with
    multS and is unused here.
    """
    if not sparse.issparse(A):
        return x.dot(A) if TP else A.dot(x)
    if TP:
        # Row-vector times A, then flatten back to a 1-D array.
        return sparse.csr_matrix(x).dot(A).transpose().todense().A[:, 0]
    return A.dot(sparse.csr_matrix(x).transpose()).todense().A[:, 0]


def multS(s, v, L, TP=False):
    """Apply the FFT-based operator built from `s` to vector `v`.

    `v` is reversed/zero-padded by prepare_v, multiplied in the frequency
    domain, and the relevant slice of the circular convolution is returned;
    TP selects the transposed operator.
    """
    N = s.shape[0]
    padded = prepare_v(v, N, L, TP=TP)
    full = irfft(rfft(padded) * rfft(s))
    if TP:
        return full[L - 1 :]
    return full[:L]


def prepare_s(s, L=None):
    """Circularly shift `s` by K - 1 = N - L positions (L defaults to N//2)."""
    N = s.shape[0]
    if L is None:
        L = N // 2
    # K = N - L + 1, so the roll amount K - 1 simplifies to N - L.
    return np.roll(s, N - L)


def prepare_v(v, N, L, TP=False):
    """Reverse `v` and zero-pad it to length N for the FFT product in multS.

    With TP=True, v must have length L and is padded on the left with
    K - 1 zeros; otherwise v must have length K = N - L + 1 and is padded
    on the right with L - 1 zeros. Raises VectorLengthException on a
    length mismatch.
    """
    flipped = v.flatten()[::-1]
    K = N - L + 1
    if TP:
        if flipped.shape[0] != L:
            raise VectorLengthException(
                "Length of v must be  L (if transpose flag is True)"
            )
        return np.pad(flipped, (K - 1, 0), mode="constant", constant_values=0)
    if flipped.shape[0] != K:
        raise VectorLengthException("Length of v must be N-K+1")
    return np.pad(flipped, (0, L - 1), mode="constant", constant_values=0)


def orthog(Y, X):
    """Orthogonalize a vector or matrix Y against the columns of the matrix X.

    Requires that Y has no more columns than X and that both share the same
    number of rows. Performs one classical Gram-Schmidt projection sweep.
    """
    coeffs = multA(X, Y, TP=True)
    return Y - multA(X, coeffs)


# Simple utility function used to check linear dependencies during computation:


def invcheck(x):
    """Return 1/x, or 0 (with a warning) when x is numerically negligible.

    Used to guard against linear dependence during the Lanczos process.
    """
    eps2 = 2 * np.finfo(float).eps
    if x <= eps2:
        warnings.warn("Ill-conditioning encountered, result accuracy may be poor")
        return 0
    return 1 / x


def lanczos(A, nval, tol=0.0001, maxit=50, center=None, scale=None, L=None):
    """Estimate a few of the largest singular values and corresponding singular
    vectors of matrix using the implicitly restarted Lanczos bidiagonalization
    method of Baglama and Reichel, see:

    Augmented Implicitly Restarted Lanczos Bidiagonalization Methods,
    J. Baglama and L. Reichel, SIAM J. Sci. Comput. 2005

    Keyword arguments:
    tol    -- An estimation tolerance. Smaller means more accurate estimates.
    maxit  -- Maximum number of Lanczos iterations allowed.
    center -- Optional vector; when given, matrix-vector products are
              corrected by np.dot(center, v) so A is treated as implicitly
              column-centered without ever being densified.
    scale  -- Optional vector; right-hand vectors are divided by it, treating
              A's columns as implicitly scaled.
    L      -- Only used when A is 1-D (FFT operator mode via multS).

    Given an input matrix A of dimension j * k, and an input desired number
    of singular values nval, the function returns a LanczosResult with
    attributes:

    U      A j * nval matrix of estimated left singular vectors.
    s      A vector of length nval of estimated singular values.
    V      A k * nval matrix of estimated right singular vectors.
    steps  The number of Lanczos iterations run.
    nmult  The number of matrix-vector products run.

    The algorithm estimates the truncated singular value decomposition:
    A.dot(V) ~ U * diag(s).
    """

    import sys  # local import: only needed for the feedback prints below

    # Feedback block (added relative to upstream irlbpy; see header note).
    print(
        f">> lanczos A={A.shape}, nval={nval}, tol={tol}, maxit={maxit}",
        file=sys.stdout,
    )
    center_story = "None" if center is None else f"{center.shape}"
    scale_story = "None" if scale is None else f"{scale.shape}"
    print(f"++ lanczos center={center_story}, scale={scale_story}", file=sys.stdout)
    sys.stdout.flush()

    mmult = None
    m = None
    n = None
    if A.ndim == 2:
        # Ordinary (possibly sparse) matrix: use the matrix-vector wrapper.
        mmult = multA
        m = A.shape[0]
        n = A.shape[1]
        if min(m, n) < 2:
            raise MatrixShapeException("The input matrix must be at least 2x2.")

    elif A.ndim == 1:
        # 1-D input: treat A as an FFT-applied operator (multS).
        mmult = multS
        A = np.pad(A, (0, A.shape[0] % 2), mode="edge")  # force even length
        N = A.shape[0]
        if L is None:
            L = N // 2
        K = N - L + 1
        m = L
        n = K
        A = prepare_s(A, L)
    elif A.ndim > 2:
        raise MatrixShapeException("The input matrix must be 2D array")
    nu = nval

    m_b = min((nu + 20, 3 * nu, n))  # Working dimension size
    mprod = 0  # matrix-vector product counter
    it = 0  # restart iteration counter
    j = 0
    k = nu
    smax = 1
    # sparse = sparse.issparse(A)

    V = np.zeros((n, m_b))  # right Lanczos vectors
    W = np.zeros((m, m_b))  # left Lanczos vectors
    F = np.zeros((n, 1))  # residual vector
    B = np.zeros((m_b, m_b))  # small projected (bidiagonal) matrix

    V[:, 0] = np.random.randn(n)  # Initial vector
    # NOTE: np.linalg.norm(V) is the Frobenius norm of all of V, but only
    # column 0 is nonzero here, so it equals that column's norm.
    V[:, 0] = V[:, 0] / np.linalg.norm(V)

    while it < maxit:
        if it > 0:
            # After a restart, resume from the k retained Ritz vectors.
            j = k

        VJ = V[:, j]

        # apply scaling
        if scale is not None:
            VJ = VJ / scale

        W[:, j] = mmult(A, VJ, L=L)
        mprod = mprod + 1

        # apply centering
        # R code: W[, j_w] <- W[, j_w] - ds * drop(cross(dv, VJ)) * du
        if center is not None:
            W[:, j] = W[:, j] - np.dot(center, VJ)

        if it > 0:
            # NB W[:,0:j] selects columns 0,1,...,j-1
            W[:, j] = orthog(W[:, j], W[:, 0:j])
        s = np.linalg.norm(W[:, j])
        sinv = invcheck(s)
        W[:, j] = sinv * W[:, j]

        # Lanczos process
        while j < m_b:
            F = mmult(A, W[:, j], TP=True, L=L)
            mprod = mprod + 1

            # apply scaling
            if scale is not None:
                F = F / scale

            F = F - s * V[:, j]
            F = orthog(F, V[:, 0 : j + 1])
            fn = np.linalg.norm(F)
            fninv = invcheck(fn)
            F = fninv * F
            if j < m_b - 1:
                V[:, j + 1] = F
                B[j, j] = s
                B[j, j + 1] = fn
                VJp1 = V[:, j + 1]

                # apply scaling
                if scale is not None:
                    VJp1 = VJp1 / scale

                W[:, j + 1] = mmult(A, VJp1, L=L)
                mprod = mprod + 1

                # apply centering
                # R code: W[, jp1_w] <- W[, jp1_w] - ds * drop(cross(dv, VJP1))
                # * du
                if center is not None:
                    W[:, j + 1] = W[:, j + 1] - np.dot(center, VJp1)

                # One step of classical Gram-Schmidt...
                W[:, j + 1] = W[:, j + 1] - fn * W[:, j]
                # ...with full reorthogonalization
                W[:, j + 1] = orthog(W[:, j + 1], W[:, 0 : (j + 1)])
                s = np.linalg.norm(W[:, j + 1])
                sinv = invcheck(s)
                W[:, j + 1] = sinv * W[:, j + 1]
            else:
                B[j, j] = s
            j = j + 1
        # End of Lanczos process
        # SVD of the small projected matrix gives the Ritz values/vectors.
        S = nla.svd(B)
        R = fn * S[0][m_b - 1, :]  # Residuals
        if it == 0:
            smax = S[1][0]  # Largest Ritz value
        else:
            smax = max((S[1][0], smax))

        # Count requested singular values whose residual passes the
        # relative tolerance test.
        conv = sum(np.abs(R[0:nu]) < tol * smax)
        if conv < nu:  # Not converged yet
            k = max(conv + nu, k)
            k = min(k, m_b - 3)
        else:
            break
        # Update the Ritz vectors
        V[:, 0:k] = V[:, 0:m_b].dot(S[2].transpose()[:, 0:k])
        V[:, k] = F
        B = np.zeros((m_b, m_b))
        # Improve this! There must be better way to assign diagonal...
        for l in range(k):
            B[l, l] = S[1][l]
        B[0:k, k] = R[0:k]
        # Update the left approximate singular vectors
        W[:, 0:k] = W[:, 0:m_b].dot(S[0][:, 0:k])
        it = it + 1

    # Recover the final singular vectors from the last projected SVD.
    U = W[:, 0:m_b].dot(S[0][:, 0:nu])
    V = V[:, 0:m_b].dot(S[2].transpose()[:, 0:nu])
    # return((U, S[1][0:nu], V, it, mprod))

    print(f"<< lanczos it={it} mprod={mprod}", file=sys.stdout)
    sys.stdout.flush()

    return LanczosResult(
        **{"U": U, "s": S[1][0:nu], "V": V, "steps": it, "nmult": mprod}
    )


class LanczosResult:
    """Attribute container for lanczos() output (U, s, V, steps, nmult)."""

    def __init__(self, **kwargs):
        # Expose every keyword argument as an instance attribute.
        self.__dict__.update(kwargs)


class VectorLengthException(Exception):
    """Raised when a vector given to prepare_v has the wrong length."""


class MatrixShapeException(Exception):
    """Raised when the input array has an unsupported shape or rank."""

Read in anndata object containing about 10 million PBMCs that have been treated with various cytokines and processed through the Parse Biosciences Evercode workflow in a single experiment.¶

In [4]:
%%time
adata = sc.read_h5ad(obj_save_path + "Parse_10M_PBMC_cytokines.h5ad")
CPU times: user 25.9 s, sys: 1min 36s, total: 2min 2s
Wall time: 4min 40s
In [5]:
adata
Out[5]:
AnnData object with n_obs × n_vars = 9697974 × 40352
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells'

Log normalize the counts matrix¶

In [6]:
%%time
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
CPU times: user 32.9 s, sys: 28.5 s, total: 1min 1s
Wall time: 1min 1s
In [7]:
adata
Out[7]:
AnnData object with n_obs × n_vars = 9697974 × 40352
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells'
    uns: 'log1p'

Determine the highly variable genes and then subset the AnnData object on these genes¶

In [8]:
%%time
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.25)
sc.pl.highly_variable_genes(adata) 
No description has been provided for this image
CPU times: user 3min 24s, sys: 1min 27s, total: 4min 52s
Wall time: 2min 32s
In [9]:
%%time
adata = adata[:, adata.var.highly_variable].copy()
CPU times: user 1min 37s, sys: 10.5 s, total: 1min 47s
Wall time: 1min 47s
In [10]:
adata
Out[10]:
AnnData object with n_obs × n_vars = 9697974 × 5412
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
In [11]:
%%time
adata.write_h5ad(obj_save_path + "adata_post_hvg.h5ad")
CPU times: user 4.65 s, sys: 27.5 s, total: 32.1 s
Wall time: 32.5 s

Replace scanpy.pp.scale and scanpy.pp.pca with the following code to use less memory¶

In [12]:
%%time
use_hv = True  # record that PCA ran on the highly-variable-gene subset
# Mirror the metadata layout that scanpy.pp.pca writes into .uns['pca'].
adata.uns['pca'] = {}
adata.uns['pca']['params'] = {
    'zero_center': True,
    'use_highly_variable': use_hv,
}

# Per-gene mean and std, passed to lanczos() so centering/scaling happen
# implicitly inside the matrix-vector products instead of densifying X
# (this replaces sc.pp.scale + sc.pp.pca).
adata.var['mean'] = adata.X.mean(0).A1
adata.var['std'] = sparse_std(adata.X)

S = lanczos(adata.X,50,center=adata.var['mean'],scale=adata.var['std'])

adata.obsm['X_pca'] = (S.U * S.s)  # cell embeddings: U scaled by singular values
adata.varm['PCs'] = S.V  # gene loadings
adata.uns['pca']['variance'] = adata.obsm['X_pca'].var(0)
# NOTE(review): scanpy's variance_ratio divides each component's variance by
# the total variance of the (scaled) data; dividing by (n_vars - 1) here is
# equivalent only if each scaled gene contributes unit variance — TODO
# confirm this matches sc.pp.pca output.
adata.uns['pca']['variance_ratio'] = adata.uns['pca']['variance'] / (adata.X.shape[1] - 1 )
>> lanczos A=(9697974, 5412), nval=50, tol=0.0001, maxit=50
++ lanczos center=(5412,), scale=(5412,)
<< lanczos it=16 mprod=236
CPU times: user 1h 54min 28s, sys: 26min 59s, total: 2h 21min 28s
Wall time: 1h 22min 19s
In [13]:
adata.write_h5ad(obj_save_path + 'adata_post_pca.h5ad')
In [14]:
adata
Out[14]:
AnnData object with n_obs × n_vars = 9697974 × 5412
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

Continue with neighbors, umap, and leiden¶

In [15]:
%%time
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30)
/home/ubuntu/miniconda3/envs/spipe3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
CPU times: user 35min 8s, sys: 1min 29s, total: 36min 37s
Wall time: 35min 59s
In [16]:
adata
Out[16]:
AnnData object with n_obs × n_vars = 9697974 × 5412
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors'
    obsm: 'X_pca'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [17]:
%%time
sc.tl.umap(adata)
CPU times: user 4h 38min 12s, sys: 2min 53s, total: 4h 41min 5s
Wall time: 2h 18min 48s
In [18]:
adata
Out[18]:
AnnData object with n_obs × n_vars = 9697974 × 5412
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [19]:
adata.write_h5ad(obj_save_path + 'adata_post_umap.h5ad')
In [22]:
%%time
sc.tl.leiden(adata, resolution=1.0, flavor="igraph", n_iterations=2)
CPU times: user 12min 44s, sys: 1min 38s, total: 14min 23s
Wall time: 14min 14s
In [23]:
adata
Out[23]:
AnnData object with n_obs × n_vars = 9697974 × 5412
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [24]:
%%time
adata.write_h5ad(obj_save_path + 'adata_post_leiden.h5ad')
CPU times: user 5.67 s, sys: 31.6 s, total: 37.2 s
Wall time: 37.8 s

Plotting¶

In [25]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8)
No description has been provided for this image
CPU times: user 22.1 s, sys: 1.01 s, total: 23.1 s
Wall time: 23.1 s
In [26]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8, legend_loc = "on data")
No description has been provided for this image
CPU times: user 22.6 s, sys: 1.05 s, total: 23.6 s
Wall time: 23.6 s
In [27]:
%%time
sc.pl.umap(adata, color='donor', legend_fontsize=8)
No description has been provided for this image
CPU times: user 22 s, sys: 985 ms, total: 23 s
Wall time: 23 s

If it is desirable to remove the batch effects of having different PBMC donors, one can run harmony integration. Due to technical variation and differences in underlying biology between PBMC donors, integrating with harmony can help users more easily identify which clusters correspond to which cell type.¶

In [2]:
%%time
adata = sc.read(obj_save_path + "adata_post_leiden.h5ad")
adata
CPU times: user 18.8 s, sys: 1min 24s, total: 1min 43s
Wall time: 7min 26s
Out[2]:
AnnData object with n_obs × n_vars = 9697974 × 5412
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
In [3]:
%%time
sce.pp.harmony_integrate(adata, 'donor')
2024-10-31 16:46:56,372 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-10-31 16:52:35,535 - harmonypy - INFO - sklearn.KMeans initialization complete.
2024-10-31 16:53:01,809 - harmonypy - INFO - Iteration 1 of 10
2024-10-31 17:36:44,967 - harmonypy - INFO - Iteration 2 of 10
2024-10-31 18:33:58,711 - harmonypy - INFO - Iteration 3 of 10
2024-10-31 19:24:56,606 - harmonypy - INFO - Iteration 4 of 10
2024-10-31 20:19:26,464 - harmonypy - INFO - Iteration 5 of 10
2024-10-31 21:05:19,316 - harmonypy - INFO - Iteration 6 of 10
2024-10-31 21:49:58,713 - harmonypy - INFO - Iteration 7 of 10
2024-10-31 22:04:55,678 - harmonypy - INFO - Converged after 7 iterations
CPU times: user 1d 21min 24s, sys: 56min 47s, total: 1d 1h 18min 12s
Wall time: 5h 18min 4s
In [4]:
%%time
adata.write_h5ad(obj_save_path + 'adata_post_harmony.h5ad')
CPU times: user 12.8 s, sys: 48.8 s, total: 1min 1s
Wall time: 1min 3s
In [ ]:
 

Continue with neighbors. To use the results of harmony integration, we set use_rep = "X_pca_harmony"¶

In [5]:
%%time
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30, use_rep="X_pca_harmony")
/home/ubuntu/miniconda3/envs/spipe3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
CPU times: user 46min 16s, sys: 2min 43s, total: 49min
Wall time: 47min 13s
In [6]:
adata
Out[6]:
AnnData object with n_obs × n_vars = 9697974 × 5412
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

Continue with umap and leiden clustering.¶

In [7]:
%%time
sc.tl.umap(adata)
CPU times: user 4h 33min 44s, sys: 4min 13s, total: 4h 37min 57s
Wall time: 2h 25min 24s
In [8]:
adata
Out[8]:
AnnData object with n_obs × n_vars = 9697974 × 5412
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
In [9]:
%%time
adata.write_h5ad(obj_save_path + 'adata_post_harmony_post_umap.h5ad')
CPU times: user 8.69 s, sys: 1min 19s, total: 1min 28s
Wall time: 1min 30s
In [10]:
%%time
sc.tl.leiden(adata, resolution=1.0, flavor="igraph", n_iterations=2)
CPU times: user 14min 2s, sys: 46.1 s, total: 14min 48s
Wall time: 14min 38s
In [11]:
adata
Out[11]:
AnnData object with n_obs × n_vars = 9697974 × 5412
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
In [12]:
%%time
adata.write_h5ad(obj_save_path + 'adata_post_harmony_post_leiden.h5ad')
CPU times: user 10.6 s, sys: 47.5 s, total: 58.1 s
Wall time: 59.7 s
In [13]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8, legend_loc = "on data")
No description has been provided for this image
CPU times: user 24.5 s, sys: 1.33 s, total: 25.8 s
Wall time: 25.8 s
In [14]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8)
No description has been provided for this image
CPU times: user 24.5 s, sys: 862 ms, total: 25.4 s
Wall time: 25.4 s

We now plot by donor to show the results of harmony integration.¶

In [15]:
%%time
sc.pl.umap(adata, color='donor', legend_fontsize=8)
No description has been provided for this image
CPU times: user 24.5 s, sys: 833 ms, total: 25.3 s
Wall time: 25.3 s

The first and most notable effect of running the integration function is seeing how the samples overlap with one another, which simplifies the process of annotating cell types as it reduces the number of clusters. In this case, we don't see a drastic reduction in the number of clusters, but in scenarios where we see greater technical and biological variation, this can have a much greater impact.¶

We now overlay cell type annotations on the umap plot, which were determined using standard marker genes for PBMCs.¶

In [16]:
%%time
sc.pl.umap(adata, color='cell_type', legend_fontsize=8, legend_loc = "on data")
No description has been provided for this image
CPU times: user 24.9 s, sys: 954 ms, total: 25.9 s
Wall time: 25.9 s
In [17]:
%%time
sc.pl.umap(adata, color='cell_type', legend_fontsize=8)
No description has been provided for this image
CPU times: user 24.7 s, sys: 937 ms, total: 25.7 s
Wall time: 25.6 s

We can also plot the cytokines¶

In [4]:
%%time
sc.pl.umap(adata, color='cytokine', legend_fontsize=8)
No description has been provided for this image
CPU times: user 25.1 s, sys: 430 ms, total: 25.5 s
Wall time: 25.6 s
In [ ]: